Species: Explore

Get Species Observations

# --- Species observations -------------------------------------------------
# Download presence points from GBIF (via spocc), cache full attributes as
# CSV and geometry-only points as GeoJSON, then read the cache and map it.
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo <- FALSE  # set TRUE to force re-download even when cached files exist

# get species occurrence data from GBIF with coordinates
# (`||` for the scalar if-condition; `TRUE`/`FALSE` over reassignable T/F)
if (!file.exists(obs_geo) || redo){
  (res <- spocc::occ(
    query = 'Phoenicopterus ruber', 
    from = 'gbif', 
    has_coords = TRUE, 
    limit = 10000
  ))
  
  # first (and only) provider result: a data frame of gbif records
  df <- res$gbif$data[[1]] 
  nrow(df) # number of rows
  readr::write_csv(df, obs_csv)
  
  # convert to sf points in WGS84 (EPSG:4326)
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn = TRUE)
}
obs <- sf::read_sf(obs_geo)
nrow(obs)
## [1] 10000
# show points on map
mapview::mapview(obs, map.types = "OpenTopoMap")

Get Environmental Data

Presence

# --- Environmental predictor layers ---------------------------------------
dir_env <- file.path(dir_data, "env")

# set a default data directory for sdmpredictors downloads
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial (not marine) datasets
# NOTE: original comment said "marine" but the code selects terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# download/load the chosen layers as a raster stack
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
plot(env_stack, nc=2)

# --- Clip environment to observations' convex hull ------------------------
# Build a convex hull around all presence points, then mask and crop the
# environmental raster stack to that hull; both results are cached to disk.
obs_hull_geo <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) || redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))

if (!file.exists(env_stack_grd) || redo){
  # raster functions want sp::Spatial* objects, not sf
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = TRUE)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

# --- Pseudo-absence points ------------------------------------------------
# Sample random background points from cells that contain NO observations,
# one pseudo-absence per presence point; cache to GeoJSON.
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) || redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask of cells WITHOUT observations (inverse of presence cells)
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)
  
  # generate random points inside mask, as many as there are observations
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn = TRUE)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
# --- Label points and extract raster values -------------------------------
# Combine presence (1) and pseudo-absence (0) into one labeled point set,
# extract environmental raster values at each point, and cache as CSV.
if (!file.exists(pts_env_csv) || redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = FALSE,
    options = list(
      dom = "t",
      pageLength = 20))

Term Plots

# Density of each environmental term, split by presence (green) vs
# absence (gray); one free-scaled panel per predictor.
pts_env %>% 
  select(-ID) %>% 
  mutate(present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  facet_wrap(~name, scales = "free") +
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() + 
  theme(
    # tuck the legend into the bottom-right corner of the plot area
    legend.position      = c(1, 0),
    legend.justification = c(1, 0))

Species: Regress

# --- Species: Regress — setup ---------------------------------------------
librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = FALSE)

# paths to cached data produced by the Explore section
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# labeled points (present 0/1) with extracted predictor values
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = FALSE)
# pairwise scatter/correlation plot, colored by presence/absence
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))

Logistic Regression

Setup Data

# setup model data: drop the ID column (not a predictor), then remove any
# rows containing NA predictor values
d <- tidyr::drop_na(select(pts_env, -ID))
nrow(d)
## [1] 19668

Linear Model

# fit a linear model of presence (0/1) on all predictors
# NOTE(review): lm treats the 0/1 response as continuous, so predictions
# are not bounded to [0, 1] — compare with the glm/gam fits below
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07599 -0.31329  0.07745  0.34631  1.44589 
## 
## Coefficients:
##                 Estimate   Std. Error t value            Pr(>|t|)    
## (Intercept)  1.570404107  0.046298365  33.919 <0.0000000000000002 ***
## WC_alt      -0.000077068  0.000007708  -9.998 <0.0000000000000002 ***
## WC_bio1     -0.039247738  0.000720435 -54.478 <0.0000000000000002 ***
## WC_bio2     -0.051579810  0.001299683 -39.686 <0.0000000000000002 ***
## ER_tri      -0.004271455  0.000224818 -19.000 <0.0000000000000002 ***
## ER_topoWet   0.033495629  0.003747848   8.937 <0.0000000000000002 ***
## lon         -0.004265912  0.000081128 -52.583 <0.0000000000000002 ***
## lat         -0.008736233  0.000171005 -51.087 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3898 on 19660 degrees of freedom
## Multiple R-squared:  0.3923, Adjusted R-squared:  0.3921 
## F-statistic:  1813 on 7 and 19660 DF,  p-value: < 0.00000000000000022
# predict on pts_env (NOT the NA-dropped d), so predictions contain NAs;
# that is why range() below returns NA NA (use na.rm = TRUE to see values)
y_predict <- predict(mdl, pts_env, type="response")
y_true    <- pts_env$present

range(y_predict)
## [1] NA NA
range(y_true)
## [1] 0 1
## [1] 0 1

Generalized Linear Model

# fit a generalized linear model with a binomial logit link function
# (logistic regression: predictions are bounded to (0, 1))
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8932  -0.6316  -0.0778   0.8326   3.7967  
## 
## Coefficients:
##                Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)  8.95095496  0.37591201  23.811 < 0.0000000000000002 ***
## WC_alt      -0.00024986  0.00005422  -4.608       0.000004056799 ***
## WC_bio1     -0.30131529  0.00640733 -47.027 < 0.0000000000000002 ***
## WC_bio2     -0.41142538  0.01062817 -38.711 < 0.0000000000000002 ***
## ER_tri      -0.03652296  0.00218994 -16.678 < 0.0000000000000002 ***
## ER_topoWet   0.18605164  0.02954437   6.297       0.000000000303 ***
## lon         -0.03068770  0.00070107 -43.773 < 0.0000000000000002 ***
## lat         -0.06701779  0.00148996 -44.980 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 27261  on 19667  degrees of freedom
## Residual deviance: 17253  on 19660  degrees of freedom
## AIC: 17269
## 
## Number of Fisher Scoring iterations: 6
# predict on the fitted data d (no NAs), on the probability scale
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.0001601382 0.9910803961
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = FALSE)

Generalized Additive Model

librarian::shelf(mgcv)

# fit a generalized additive model with smooth predictors
# s() wraps each term in a penalized smooth, letting the response vary
# nonlinearly with each predictor
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept)  -2.7728     0.2715  -10.21 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq             p-value    
## s(WC_alt)     8.870  8.983  692.12 <0.0000000000000002 ***
## s(WC_bio1)    7.632  8.178  463.75 <0.0000000000000002 ***
## s(WC_bio2)    8.917  8.997  366.79 <0.0000000000000002 ***
## s(ER_tri)     7.447  7.756   83.36 <0.0000000000000002 ***
## s(ER_topoWet) 7.237  8.130   55.26 <0.0000000000000002 ***
## s(lon)        8.997  9.000  632.41 <0.0000000000000002 ***
## s(lat)        8.878  8.989 1022.83 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.868   Deviance explained = 81.9%
## UBRE = -0.74255  Scale est. = 1         n = 19668
# show term plots (one panel per smooth; scale=0 lets each panel pick
# its own y-axis range)
plot(mdl, scale=0)

Maximum Entropy

Maxent is probably the most commonly used species distribution model, since it performs well with few input data points, only requires presence points, and is easy to use with a Java graphical user interface (GUI).

# load extra packages
librarian::shelf(
  maptools, sf)

# show version of maxent (requires the maxent.jar Java app bundled with dismo)
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model: presence points + raster stack only,
# no absence points needed
mdl <- maxent(env_stack, obs_sp)
## This is MaxEnt version 3.4.3
# variable contribution plot
plot(mdl)

# plot term plots (response curves per predictor)
response(mdl)

# predict habitat suitability across the whole raster extent
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Species: Trees

# read the cached labeled points with predictor values
pts_env <- read_csv(pts_env_csv)

# model data: categorical response, predictors only, complete cases
d <- pts_env %>% 
  mutate(present = factor(present)) %>%  # categorical response for trees
  select(-ID) %>%                        # ID is not a predictor
  na.omit()                              # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 19668
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9979, 1: 9689

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 414.38 545.31 -38.00 13.00 143.50 612.25 3995.00 ▇▂▁▁▁
WC_bio1 0 1 22.47 4.62 -0.90 18.40 24.10 26.30 30.60 ▁▁▃▅▇
WC_bio2 0 1 11.51 3.24 4.90 9.20 11.30 14.20 20.40 ▃▇▇▆▁
ER_tri 0 1 14.38 21.94 0.00 2.74 7.42 15.56 295.78 ▇▁▁▁▁
ER_topoWet 0 1 12.24 1.41 7.00 11.36 12.41 13.34 15.80 ▁▂▆▇▁
lon 0 1 -27.00 49.06 -117.14 -76.29 -7.71 19.12 40.04 ▃▆▁▂▇
lat 0 1 3.90 22.04 -34.71 -13.96 10.60 21.32 52.05 ▆▃▆▇▁

Split data into training and testing

# create training set with 80% of full data
# stratifying on "present" keeps the 0/1 class balance similar in both splits
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9979 9689
table(d_train$present)
## 
##    0    1 
## 7983 7751

Decision Trees

Partition, depth = 1

# run decision stump model: maxdepth = 1 forces a single split;
# cp = 0 disables complexity pruning, minbucket = 5 sets min leaf size
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 15734 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15734 7751 0 (0.50737257 0.49262743)  
##   2) WC_alt>=47.5 9320 1866 0 (0.79978541 0.20021459) *
##   3) WC_alt< 47.5 6414  529 1 (0.08247583 0.91752417) *
# plot tree 
par(mar = c(1, 1, 1, 1))  # shrink margins so the tree fills the device
rpart.plot(mdl)

Partition, depth=default

# decision tree with defaults (cp = 0.01, so the tree is pruned automatically)
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 15734 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 15734 7751 0 (0.50737257 0.49262743)  
##   2) WC_alt>=47.5 9320 1866 0 (0.79978541 0.20021459)  
##     4) lat>=-24.99979 7538  353 0 (0.95317060 0.04682940) *
##     5) lat< -24.99979 1782  269 1 (0.15095398 0.84904602) *
##   3) WC_alt< 47.5 6414  529 1 (0.08247583 0.91752417)  
##     6) lat>=30.20225 131   19 0 (0.85496183 0.14503817) *
##     7) lat< 30.20225 6283  417 1 (0.06636957 0.93363043) *
rpart.plot(mdl)

# plot complexity parameter: cross-validated error vs tree size
plotcp(mdl)

# rpart cross validation results (xerror = cross-validated relative error)
mdl$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.69100761      0 1.0000000 1.0000000 0.008090673
## 2 0.16049542      1 0.3089924 0.3089924 0.005813492
## 3 0.01199845      2 0.1484970 0.1492711 0.004223995
## 4 0.01000000      3 0.1364985 0.1372726 0.004063578

Feature interpretation

# caret cross validation results: 10-fold CV over 20 candidate cp values
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

# CV accuracy vs complexity parameter
ggplot(mdl_caret)

# variable importance plot
vip(mdl_caret, num_features = 40, bar = FALSE)

# Construct partial dependence plots: predicted response as a function of
# one (or two) predictors, averaging over the others
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

Random Forests

Fit

# number of features (predictors), i.e. everything except the response
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# out-of-bag error; NOTE(review): with a factor response this is a
# classification forest, so prediction.error is the OOB misclassification
# rate — sqrt() of it is not strictly an RMSE (label kept from original)
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1090188

Feature Interpretation

# re-run model with impurity-based variable importance
# (decrease in node impurity attributable to each predictor)
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
# (drop in OOB accuracy when each predictor is permuted)
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

# compare the two importance rankings side by side
gridExtra::grid.arrange(p1, p2, nrow = 1)

Species: Evaluate

Setup

# --- Species: Evaluate — setup --------------------------------------------
librarian::shelf(
  dismo, # species distribution modeling: maxent(), predict(), evaluate(), 
  dplyr, ggplot2, GGally, here, maptools, readr, 
  raster, readr, rsample, sf,
  usdm)  # uncertainty analysis for species distribution models: vifcor()
select <- dplyr::select # overwrite raster::select (use `<-`, not `=`)

# options
set.seed(42)  # reproducible train/test split
options(
  scipen = 999,
  readr.show_col_types = FALSE)
ggplot2::theme_set(ggplot2::theme_light())

# paths to cached inputs and the model cache file
dir_data      <- here("data/sdm")
pts_geo       <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds  <- file.path(dir_data, "mdl_maxent_vif.rds")

# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)

# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)

1.1 Split observations into training and testing

# create training set with 80% of full data, stratified on presence so
# both splits keep a similar 0/1 balance
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

# training presences as sp::Spatial* (expected by dismo/maxent)
pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()

# training absences as sp::Spatial*
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

2 Calibrate: Model Selection

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# each point in the scatter plot on the lower triangle represents a pixel that is placed on the x and y axis based on the variables it is comparing. 
# the correlation is how tight the correspondence of the scatter plot is 
# the numbers are smaller where there is less correlation 

# the highest VIF indicates the strongest multicollinearity 


# calculate variance inflation factor per predictor, a metric of multicollinearity between variables
vif(env_stack)
##    Variables      VIF
## 1     WC_alt 1.959786
## 2    WC_bio1 1.488571
## 3    WC_bio2 1.559917
## 4     ER_tri 3.395112
## 5 ER_topoWet 3.661043
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 1 variables from the 5 input variables have collinearity problem: 
##  
## ER_topoWet 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( WC_bio2 ~ WC_bio1 ):  0.1079827 
## max correlation ( WC_bio1 ~ WC_alt ):  -0.4755387 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1    WC_alt 1.930290
## 2   WC_bio1 1.451490
## 3   WC_bio2 1.417381
## 4    ER_tri 1.515366
# gives you a reduced set of predictors 


# reduce environmental raster stack by dropping the excluded layer(s)
env_stack_v <- usdm::exclude(env_stack, v)


# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

# using the v above, none of them should have a correlation coefficient greater than 0.7 because of the threshold. 
# fit a maximum entropy model on the VIF-reduced predictor stack; the fitted
# model is cached to RDS (NOTE(review): no `redo` escape hatch here, unlike
# earlier cache blocks — delete the .rds to refit)
if (!file.exists(mdl_maxv_rds)){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)

# plot variable contributions per predictor
plot(mdl_maxv)

# plot term plots
response(mdl_maxv)

# predict suitability across the full (unreduced) raster extent
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Evaluate: Model Performance

3.1 Area Under the Curve (AUC), Receiver Operating Characteristic (ROC) Curve and Confusion Matrix

# held-out test presences as sp::Spatial* for dismo::evaluate()
pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
# held-out test absences
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

# this is a prediction and you're comparing the prediction to our known presence and absence points. 
e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1937 
## n absences     : 1993 
## AUC            : 0.9493625 
## cor            : 0.7697822 
## max TPR+TNR at : 0.652004
plot(e, 'ROC')

# threshold that maximizes sensitivity + specificity (TPR + TNR)
thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.652004
# of the presences observed, how many true predictions are we getting. 
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
# of the absences observed, how many true predictions are we getting 
# anything less than the threshold will be a 0
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)


# TPR & TNR is true positive and negative rates 
# (t)rue/(f)alse (p)ositive/(n)egative rates
# p_true is a vector of trues and falses. 
# this gives us a rate 
tpr <- sum(p_true)/length(p_true)

# how many we observed were present but we predicted absence 
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)
# the terms above populate the 2x2 confusion matrix below 
matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs   0.90139391   0.1013547
## absent_obs    0.09860609   0.8986453
# add point to ROC plot (the chosen threshold's FPR/TPR)
points(fpr, tpr, pch=23, bg="blue")

# any of the points along the curve represent a different value for 0 and 1 which is different than the axis 


# y_maxv is the predicted maxent surface from the environmental stack;
# applying the threshold to give you the distribution of 1s in green and 0s in grey for habitat no habitat. 
plot(y_maxv > thr)